import pandas as pd
%matplotlib inline
# Load the training set from train.csv in the current working directory.
data_train = pd.read_csv('train.csv')
# Inspect the training frame: column summary, descriptive statistics, first rows.
# NOTE(review): describe()/head() are bare expressions; they only display
# something when they are the last statement of a notebook cell.
data_train.info()
data_train.describe()
data_train.head()
# Load the test set from test.csv and inspect it the same way.
data_test = pd.read_csv('test.csv')
data_test.info()
data_test.head()
# data_train.drop(['Ticket', 'Cabin'], axis = 1)
# data_train.dropna()
# coding: utf-8  (NOTE: inert here; an encoding declaration is only honoured on line 1 or 2 of a file)
import matplotlib.pyplot as plt

# Overview figure: five panels on a 2x3 grid describing the training set
# (survival counts, class counts, age vs. survival, age density per class,
# embarkation counts).

# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign rendering correctly with that font.
plt.rcParams['axes.unicode_minus'] = False

# NOTE(review): figsize=(18, 6) at dpi=1600 is a 28800 x 9600 pixel canvas;
# confirm that resolution is really intended.
fig = plt.figure(figsize=(18,6), dpi=1600)
# Set the alpha (transparency) property on the figure.
fig.set(alpha=0.2)

# Panel (0, 0): bar chart of survivors vs. non-survivors.
plt.subplot2grid((2,3),(0,0))
data_train.Survived.value_counts().plot(kind='bar')
plt.title(u"获救情况 (1为获救)")
plt.ylabel(u"人数")

# Panel (0, 1): number of passengers per class.
plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind="bar")
plt.ylabel(u"人数")
plt.title(u"乘客等级分布")

# Panel (0, 2): age scattered against the survival flag.
plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.ylabel(u"年龄")
# The on/off flag is passed positionally: its keyword was `b` before
# Matplotlib 3.5 and is `visible` afterwards, so `b=True` fails on current
# releases while the positional form works on both.
plt.grid(True, which='major', axis='y')
plt.title(u"按年龄看获救分布 (1为获救)")

# Panel (1, 0), two columns wide: age density for each passenger class.
plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.xlabel(u"年龄")
plt.ylabel(u"密度")
plt.title(u"各等级的乘客年龄分布")
# Legend entries follow the plotting order above (class 1, 2, 3).
plt.legend((u'头等舱', u'2等舱',u'3等舱'),loc='best')

# Panel (1, 2): number of passengers per embarkation port.
plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title(u"各登船口岸上船人数")
plt.ylabel(u"人数")
plt.show()
# Survival outcome for each passenger class, as a stacked bar chart.
fig = plt.figure()
fig.set(alpha=0.2)  # set the alpha (transparency) property on the figure
# Class counts among non-survivors and among survivors.
Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df = pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# left `fig` above empty; draw into `fig` explicitly instead.
ax = fig.add_subplot(111)
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u"各乘客等级的获救情况")
ax.set_xlabel(u"乘客等级")
ax.set_ylabel(u"人数")
plt.show()
# Survival outcome for each embarkation port, as a stacked bar chart.
fig = plt.figure()
fig.set(alpha=0.2)  # set the alpha (transparency) property on the figure
# Port counts among non-survivors and among survivors.
Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df = pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# left `fig` above empty; draw into `fig` explicitly instead.
ax = fig.add_subplot(111)
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u"各登录港口乘客的获救情况")
ax.set_xlabel(u"登录港口")
ax.set_ylabel(u"人数")
plt.show()
# Survival outcome split by sex, as a stacked bar chart.
fig = plt.figure()
fig.set(alpha=0.2)  # set the alpha (transparency) property on the figure
# Counts of each Survived value among men and among women.
Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({u'男性':Survived_m, u'女性':Survived_f})
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# left `fig` above empty; draw into `fig` explicitly instead.
ax = fig.add_subplot(111)
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u"获救情况(1=获救)")
# The frame is indexed by the Survived value and the sexes are the stacked
# columns, so the x axis shows the survival flag (it was labelled as sex).
ax.set_xlabel(u"是否获救")
ax.set_ylabel(u"人数")
plt.show()
# Survival counts by sex and cabin-class group (classes 1-2 vs. class 3),
# drawn as four bar charts that share one y axis.
# NOTE(review): figsize=(18, 4) at dpi=1600 is a 28800 x 6400 pixel canvas;
# confirm that resolution is really intended.
fig = plt.figure(figsize=(18,4), dpi=1600)
fig.set(alpha=0.2)
# A figure-level title; plt.title() here would first create an extra
# full-figure Axes underneath the four subplots.
fig.suptitle(u"根据舱等级和性别的获救情况")

is_third_class = data_train.Pclass == 3
# One entry per panel: (sex, class mask, legend label, bar colour).
panels = [
    ('female', ~is_third_class, u"女性/高级舱", '#FA2479'),
    ('female', is_third_class, u"女性/低级舱", 'pink'),
    ('male', ~is_third_class, u"男性/高级舱", 'lightblue'),
    ('male', is_third_class, u"男性/低级舱", 'steelblue'),
]

first_ax = None
for position, (sex, class_mask, label, color) in enumerate(panels, start=1):
    # Every panel after the first shares the first panel's y axis.
    ax = fig.add_subplot(1, 4, position, sharey=first_ax)
    if first_ax is None:
        first_ax = ax
    # A single combined mask replaces the chained boolean indexing.
    counts = data_train.Survived[(data_train.Sex == sex) & class_mask].value_counts()
    # value_counts() orders by frequency, so pin the order to [0, 1]; the
    # fixed tick labels below are then correct for every panel.
    counts = counts.reindex([0, 1], fill_value=0)
    counts.plot(kind='bar', ax=ax, label=label, color=color)
    ax.set_xticklabels([u"未获救", u"获救"], rotation=0)
    ax.legend(loc='best')
plt.show()
# Passenger counts for every (SibSp, Survived) combination, as a bar chart.
group = data_train.groupby(['SibSp', 'Survived'])
df = group['PassengerId'].count().to_frame()
df.plot(kind='bar', stacked=True)
# The same breakdown for (Parch, Survived).
group = data_train.groupby(['Parch', 'Survived'])
df = group['PassengerId'].count().to_frame()
df.plot(kind='bar')
# Preliminary feature/label split; the same three names are reassigned
# further down once the columns have been cleaned.
X_train = data_train.drop(columns="Survived")
Y_train = data_train["Survived"]
X_test = data_test.drop(columns="PassengerId").copy()
# Summary of the object-typed (non-numeric) columns.
data_train.describe(include=['O'])
# Mean of the Survived flag per value of a single feature.
data_train.groupby('Pclass', as_index=False)['Survived'].mean().sort_values(by='Survived', ascending=False)
data_train.groupby('Sex', as_index=False)['Survived'].mean().sort_values(by='Survived', ascending=False)
data_train.groupby('SibSp', as_index=False)['Survived'].mean()
data_train.groupby('Parch', as_index=False)['Survived'].mean().sort_values(by='Survived')
data_train.groupby('Age', as_index=False)['Survived'].mean()
# Survival outcome for each distinct age value, as a wide stacked bar chart.
# The original also passed dpi=1600, but that figure was never drawn into, so
# the plot actually rendered at the default dpi. The default is kept here
# rather than newly triggering a 28800 x 9600 pixel render.
fig = plt.figure(figsize=(18,6))
fig.set(alpha=0.2)
# Age counts among non-survivors and among survivors.
Survived_0 = data_train.Age[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Age[data_train.Survived == 1].value_counts()
df=pd.DataFrame({u'获救':Survived_1, u'未获救':Survived_0})
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# left `fig` (and its figsize) unused; draw into `fig` explicitly instead.
ax = fig.add_subplot(111)
df.plot(kind='bar', stacked=True, ax=ax)
ax.set_title(u"各年龄段的获救情况")
ax.set_xlabel(u"年龄")
ax.set_ylabel(u"人数")
plt.show()
# Remove Ticket and Cabin from both frames, checking shapes along the way.
data_train = data_train.drop(columns=['Ticket', 'Cabin'])
data_train.shape
data_test.shape
data_test = data_test.drop(columns=['Ticket', 'Cabin'])
data_test.shape
# Name goes from both frames; PassengerId is removed from the training frame
# only (the test frame's PassengerId is read later for the submission file).
data_train = data_train.drop(columns=['Name', 'PassengerId'])
data_test = data_test.drop(columns=['Name'])
data_test.shape
# Integer codes for the two categorical columns, defined once so the training
# and test frames are guaranteed to be encoded identically.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
sex_codes = {'female': 1, 'male': 0}

# Training frame: missing ports are filled with "S" before encoding.
data_train["Embarked"] = data_train["Embarked"].fillna("S")
data_train['Embarked'] = data_train['Embarked'].map(embarked_codes).astype(int)
data_train.head()
data_train['Sex'] = data_train['Sex'].map(sex_codes).astype(int)
data_train.head()

# Intermediate feature/label split (reassigned again after later steps).
X_train = data_train.drop("Survived", axis=1)
Y_train = data_train["Survived"]
X_test = data_test.drop("PassengerId", axis=1).copy()
data_train.shape
data_train

# Test frame: apply the same fill as the training frame before encoding.
# Without it, a missing port maps to NaN and .astype(int) raises.
data_test['Embarked'] = data_test['Embarked'].fillna("S").map(embarked_codes).astype(int)
data_test['Sex'] = data_test['Sex'].map(sex_codes).astype(int)
data_test.head()
import numpy as np

# guess_ages[sex, pclass - 1] is the median known age of that (Sex, Pclass)
# group in the training frame, rounded to the nearest 0.5.
guess_ages = np.zeros((2, 3))
for sex_code in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (data_train['Sex'] == sex_code) & (data_train['Pclass'] == pclass)
        # median() ignores missing ages by default.
        group_median = data_train.loc[in_group, 'Age'].median()
        guess_ages[sex_code, pclass - 1] = int(group_median / 0.5 + 0.5) * 0.5
guess_ages

# Fill each passenger's missing age with the guess for their group.
for sex_code in (0, 1):
    for pclass in (1, 2, 3):
        needs_age = (
            data_train.Age.isnull()
            & (data_train.Sex == sex_code)
            & (data_train.Pclass == pclass)
        )
        data_train.loc[needs_age, 'Age'] = guess_ages[sex_code, pclass - 1]
# With the gaps filled, the ages are converted to integers.
data_train['Age'] = data_train['Age'].astype(int)
data_train
# guess_test_ages[sex, pclass - 1] is the median known age of that
# (Sex, Pclass) group in the test frame, rounded to the nearest 0.5.
guess_test_ages = np.zeros((2, 3))
for sex_code in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (data_test['Sex'] == sex_code) & (data_test['Pclass'] == pclass)
        # median() ignores missing ages by default.
        group_median = data_test.loc[in_group, 'Age'].median()
        guess_test_ages[sex_code, pclass - 1] = int(group_median / 0.5 + 0.5) * 0.5
guess_test_ages

# Fill each test passenger's missing age with the guess for their group.
for sex_code in (0, 1):
    for pclass in (1, 2, 3):
        needs_age = (
            data_test.Age.isnull()
            & (data_test.Sex == sex_code)
            & (data_test.Pclass == pclass)
        )
        data_test.loc[needs_age, 'Age'] = guess_test_ages[sex_code, pclass - 1]
# With the gaps filled, the ages are converted to integers.
data_test['Age'] = data_test['Age'].astype(int)
data_test

# Refresh the feature/label split now that Age is complete in both frames.
X_train = data_train.drop(columns="Survived")
Y_train = data_train["Survived"]
X_test = data_test.drop(columns="PassengerId").copy()
X_train.describe()
# Replace each training fare with the code (0-3) of the band it falls in.
# All four masks are taken from the original fares before any value is
# overwritten; position in the list is the band code.
fare = data_train['Fare']
fare_bands = [
    fare <= 7.91,
    (fare > 7.91) & (fare <= 14.454),
    (fare > 14.454) & (fare <= 31),
    fare > 31,
]
for band_code, in_band in enumerate(fare_bands):
    data_train.loc[in_band, 'Fare'] = band_code
data_train['Fare'] = data_train['Fare'].astype(int)
data_train
# Show the test rows whose fare is missing.
data_test[ data_test['Fare'].isnull() ][['Sex','Pclass','Age','Fare']].head()
# Fill missing test fares with the median of the known test fares.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `data_test['Fare']` selection is chained assignment, which warns on
# pandas 2.x and does not modify the frame under Copy-on-Write.
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())

# Replace each fare with the code (0-3) of the band it falls in, using the
# same band edges as for the training frame. The codes are all <= 7.91, so an
# already-coded value never matches a later (higher) band.
data_test.loc[ data_test['Fare'] <= 7.91, 'Fare'] = 0
data_test.loc[(data_test['Fare'] > 7.91) & (data_test['Fare'] <= 14.454), 'Fare'] = 1
data_test.loc[(data_test['Fare'] > 14.454) & (data_test['Fare'] <= 31), 'Fare'] = 2
data_test.loc[ data_test['Fare'] > 31, 'Fare'] = 3
data_test['Fare'] = data_test['Fare'].astype(int)
data_test

# Final feature/label split used by the models below.
X_train = data_train.drop("Survived", axis=1)
Y_train = data_train["Survived"]
X_test = data_test.drop("PassengerId", axis=1).copy()
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Each section fits one classifier on the training features, predicts the
# test set, and records the classifier's mean accuracy on the training data
# itself (not on held-out data).

# Logistic regression
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_logreg = logreg.score(X_train, Y_train)
acc_logreg

# Support vector machine
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = svc.score(X_train, Y_train)
acc_svc

# k-nearest neighbours with k = 3
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = knn.score(X_train, Y_train)
acc_knn

# Gaussian naive Bayes
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = gaussian.score(X_train, Y_train)
acc_gaussian

# Linear support vector classifier
linear_svc = LinearSVC().fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = linear_svc.score(X_train, Y_train)
acc_linear_svc

# Decision tree
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = decision_tree.score(X_train, Y_train)
acc_decision_tree

# Random forest of 100 trees; its predictions get their own name because
# they are the ones written to the submission file.
random_forest = RandomForestClassifier(n_estimators=100).fit(X_train, Y_train)
Y_rf_pred = random_forest.predict(X_test)
acc_random_forest = random_forest.score(X_train, Y_train)
acc_random_forest
# One (model name, training accuracy) row per fitted classifier.
score_rows = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_logreg),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(score_rows, columns=['Model', 'Score'])
# Best score first.
models.sort_values(by='Score', ascending=False)
# Pair every test PassengerId with the random forest's prediction and write
# the two columns to submission.csv without the index.
submission = data_test[["PassengerId"]].copy()
submission["Survived"] = Y_rf_pred
submission.to_csv('submission.csv', index=False)